## Loading required package: ggplot2
## Loading required package: ggpubr
##
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
##
## myeloma
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
## riskRegression version 2023.12.21
## Loading required package: prodlim
##
## Attaching package: 'pec'
## The following objects are masked from 'package:riskRegression':
##
## ipcw, selectCox
## The following object is masked from 'package:caret':
##
## R2
##
## randomForestSRC 3.3.1
##
## Type rfsrc.news() to see new features, changes, and bug fixes.
##
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## subject time death cd4 time_obs treatment sex prev_infection
## 1 1 16.97 0 10.677078 0 ddC male AIDS
## 2 1 16.97 0 8.426150 6 ddC male AIDS
## 3 1 16.97 0 9.433981 12 ddC male AIDS
## 4 2 19.00 0 6.324555 0 ddI male noAIDS
## 5 2 19.00 0 8.124038 6 ddI male noAIDS
## 6 2 19.00 0 4.582576 12 ddI male noAIDS
## azt
## 1 intolerance
## 2 intolerance
## 3 intolerance
## 4 intolerance
## 5 intolerance
## 6 intolerance
## 'data.frame': 1405 obs. of 9 variables:
## $ subject : int 1 1 1 2 2 2 2 3 3 3 ...
## $ time : num 17 17 17 19 19 ...
## $ death : int 0 0 0 0 0 0 0 1 1 1 ...
## $ cd4 : num 10.68 8.43 9.43 6.32 8.12 ...
## $ time_obs : int 0 6 12 0 6 12 18 0 2 6 ...
## $ treatment : chr "ddC" "ddC" "ddC" "ddI" ...
## $ sex : chr "male" "male" "male" "male" ...
## $ prev_infection: chr "AIDS" "AIDS" "AIDS" "noAIDS" ...
## $ azt : chr "intolerance" "intolerance" "intolerance" "intolerance" ...
df$treatment <- as.factor(df$treatment)
df$sex <- as.factor(df$sex)
df$prev_infection<-as.factor(df$prev_infection)
df$azt <- as.factor(df$azt)
str(df)## 'data.frame': 1405 obs. of 9 variables:
## $ subject : int 1 1 1 2 2 2 2 3 3 3 ...
## $ time : num 17 17 17 19 19 ...
## $ death : int 0 0 0 0 0 0 0 1 1 1 ...
## $ cd4 : num 10.68 8.43 9.43 6.32 8.12 ...
## $ time_obs : int 0 6 12 0 6 12 18 0 2 6 ...
## $ treatment : Factor w/ 2 levels "ddC","ddI": 1 1 1 2 2 2 2 2 2 2 ...
## $ sex : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 1 1 1 ...
## $ prev_infection: Factor w/ 2 levels "AIDS","noAIDS": 1 1 1 2 2 2 2 1 1 1 ...
## $ azt : Factor w/ 2 levels "failure","intolerance": 2 2 2 2 2 2 2 2 2 2 ...
## Nombre de données manquantes : 0
## Nombre de doublons : 0
CD4 Summary :
df$cd4_cat <- cut(df$cd4,
breaks = c(0, 7, 15, 24.125),
labels = c("Low", "Medium", "High"),
right = TRUE)
summary(df$cd4)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.162 5.477 7.023 10.440 24.125
histogram_plot = ggplot(df, aes(x = cd4)) +
geom_histogram(fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Histogramme de la variable CD4", x = "CD4", y = "Fréquence") +
theme_minimal()
density_plot = ggplot(df, aes(x = cd4)) +
geom_density(fill = "darkgreen", alpha = 0.5) +
labs(title = "Densité de la variable CD4", x = "CD4", y = "Densité") +
theme_minimal()
histogram_plot## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Sex distribution:
histo_sex_plot = ggplot(df, aes(x = sex)) +
geom_bar(fill = "lightblue", color = "black") +
labs(title = "Distribution du Sexe", x = "Sexe", y = "Nombre d'individus") +
theme_minimal()
cat("Résumé numérique de la variable sex : \n")## Résumé numérique de la variable sex :
##
## female male
## 117 1288
We notice that the vast majority of patients are men.
cd4_azt_plot = ggplot(df, aes(x = azt, y = cd4, fill = azt)) +
geom_boxplot() +
labs(title = "Boxplot de CD4 par Sexe", x = "AZT", y = "CD4") +
theme_minimal()
cd4_azt_plotggplot(df, aes(x = azt, y = cd4, fill = azt)) +
geom_boxplot() +
labs(title = "Boxplot de CD4 par Traitement", x = "Traitement", y = "CD4") +
theme_minimal()boxplot_infection = ggplot(df, aes(x = interaction(prev_infection, azt), y = cd4, fill = azt)) +
geom_boxplot() +
labs(title = "Boxplot de CD4 par Sexe et Infections", x = "Sexe et Inféctions", y = "CD4") +
theme_minimal()
boxplot_infectionsurv_plot_treatment <- ggsurvplot (
survfit(Surv(time, death) ~ treatment, data = df),
data = df,
pval = TRUE,
conf.int = TRUE,
title = "Courbe de Survie par Traitement",
xlab = " Temps (mois)",
ylab = "Probabilité de survie",
legend.title = "Type de Traitement",
palette = RColorBrewer::brewer.pal(3, "Set1")
)
surv_plot_treatment## Call:
## survdiff(formula = Surv(time, death) ~ treatment, data = df)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## treatment=ddC 717 200 213 0.843 1.75
## treatment=ddI 688 212 199 0.906 1.75
##
## Chisq= 1.8 on 1 degrees of freedom, p= 0.2
ggsave("surv_plot_treatment.png", plot = surv_plot_treatment$plot, width = 8, height = 6, dpi = 300)fit_sex <- survfit(Surv(time, death) ~ sex, data = df)
survplot_sex = ggsurvplot (
fit_sex,
data = df,
pval = TRUE,
conf.int = TRUE,
title = "Courbe de Survie par Genre",
xlab = " Temps (mois)",
ylab = "Probabilité de survie",
legend.title = "Genre",
palette = "viridis"
)
survplot_sex## Call:
## survdiff(formula = Surv(time, death) ~ sex, data = df)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## sex=female 117 33 32.5 0.007689 0.00838
## sex=male 1288 379 379.5 0.000658 0.00838
##
## Chisq= 0 on 1 degrees of freedom, p= 0.9
fit_cd4 <- survfit(Surv(time, death) ~ cd4_cat, data = df)
surv_plot_cd4 <- ggsurvplot (
fit_cd4,
data = df,
pval = TRUE,
conf.int = TRUE,
title = "Survival Curve based on CD4 Cell Count",
xlab = " Times (months)",
ylab = "Survival probability",
legend.title = "CD4 Cell count. \n",
palette = c("#E7B800", "#2E9FDF", "darkgreen")
)
surv_plot_cd4fit_infection <- survfit(Surv(time, death) ~ prev_infection, data = df)
surv_plot_infection <- ggsurvplot (
fit_infection,
data = df,
pval = TRUE,
conf.int = TRUE,
title = "Courbe de Survie selon les antécédents d'infection (SIDA)",
xlab = " Temps (mois)",
ylab = "Probabilité de survie",
legend.title = "Antécédents d'Infection \n",
palette = c("#E7B800", "#2E9FDF")
)
logrank_infection <- survdiff(Surv(time, death) ~ prev_infection, data = df)
logrank_infection## Call:
## survdiff(formula = Surv(time, death) ~ prev_infection, data = df)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## prev_infection=AIDS 863 344 232 53.9 125
## prev_infection=noAIDS 542 68 180 69.6 125
##
## Chisq= 125 on 1 degrees of freedom, p= <2e-16
ggsave("surv_plot_infection.png", plot = surv_plot_infection$plot, width = 8, height = 6, dpi = 300)surv_plot_azt <- ggsurvplot (
survfit(Surv(time, death) ~ azt, data = df),
data = df,
pval = TRUE,
conf.int = TRUE,
title = "Courbe de Survie par Genre",
xlab = " Temps (mois)",
ylab = "Probabilité de survie",
legend.title = "Genre",
palette = RColorBrewer::brewer.pal(3, "Dark2")
)
surv_plot_azt## Call:
## survdiff(formula = Surv(time, death) ~ treatment, data = df)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## treatment=ddC 717 200 213 0.843 1.75
## treatment=ddI 688 212 199 0.906 1.75
##
## Chisq= 1.8 on 1 degrees of freedom, p= 0.2
train <- createDataPartition(df$death, p = 0.7, list = FALSE)
train_data <- df[train,]
test_data <- df[-train,]
cat("Distribution de la variable 'death' dans l'ensemble d'entraînement :\n")## Distribution de la variable 'death' dans l'ensemble d'entraînement :
##
## 0 1
## 698 286
##
## Distribution de la variable 'death' dans l'ensemble de test :
##
## 0 1
## 295 126
train_data$prev_infection <- relevel(train_data$prev_infection, ref = "AIDS")
train_data$azt <- relevel(train_data$azt, ref = "failure")
cox <- coxph(Surv(time, death) ~prev_infection + azt, data = train_data, x = TRUE)
cox## Call:
## coxph(formula = Surv(time, death) ~ prev_infection + azt, data = train_data,
## x = TRUE)
##
## coef exp(coef) se(coef) z p
## prev_infectionnoAIDS -1.53564 0.21532 0.18100 -8.484 <2e-16
## aztintolerance 0.03934 1.04012 0.12945 0.304 0.761
##
## Likelihood ratio test=113.5 on 2 df, p=< 2.2e-16
## n= 984, number of events= 286
## chisq df p
## prev_infection 3.23 1 0.072
## azt 2.67 1 0.102
## GLOBAL 4.12 2 0.128
Cela signifie que les effets des covariables sur le risque de survie sont constants dans le temps. Cela valide l’utilisation d’un modèle de Cox avec ces covariables.
train_data$azt <- factor(train_data$azt)
train_data$prev_infection <- factor(train_data$prev_infection)
cox_nl <- coxph(Surv(time, death) ~ prev_infection + azt + ns(cd4, df = 2), data = train_data)
summary(cox_nl)## Call:
## coxph(formula = Surv(time, death) ~ prev_infection + azt + ns(cd4,
## df = 2), data = train_data)
##
## n= 984, number of events= 286
##
## coef exp(coef) se(coef) z Pr(>|z|)
## prev_infectionnoAIDS -1.10477 0.33129 0.18551 -5.955 2.59e-09 ***
## aztintolerance 0.09237 1.09677 0.12901 0.716 0.473990
## ns(cd4, df = 2)1 -3.24233 0.03907 0.47228 -6.865 6.64e-12 ***
## ns(cd4, df = 2)2 -3.30297 0.03677 0.95808 -3.447 0.000566 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## prev_infectionnoAIDS 0.33129 3.0185 0.230303 0.4765
## aztintolerance 1.09677 0.9118 0.851728 1.4123
## ns(cd4, df = 2)1 0.03907 25.5934 0.015483 0.0986
## ns(cd4, df = 2)2 0.03677 27.1931 0.005624 0.2405
##
## Concordance= 0.71 (se = 0.015 )
## Likelihood ratio test= 174.1 on 4 df, p=<2e-16
## Wald test = 114.3 on 4 df, p=<2e-16
## Score (logrank) test = 150.6 on 4 df, p=<2e-16
## chisq df p
## prev_infection 4.01 1 0.045
## azt 2.33 1 0.127
## ns(cd4, df = 2) 3.16 2 0.206
## GLOBAL 7.65 4 0.105
## Analysis of Deviance Table
## Cox model: response is Surv(time, death)
## Model 1: ~ prev_infection + azt
## Model 2: ~ prev_infection + azt + ns(cd4, df = 2)
## loglik Chisq Df Pr(>|Chi|)
## 1 -1811.3
## 2 -1781.0 60.644 2 6.781e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
patient_1 <- data.frame(prev_infection = "noAIDS", azt = "failure")
patient_2 <- data.frame(prev_infection = "AIDS", azt = "intolerance")
patient_1_nl <- data.frame(prev_infection = "noAIDS", azt = "failure", cd4 = 20)
patient_2_nl <- data.frame(prev_infection = "AIDS", azt = "intolerance", cd4 = 2)
surv_patient_1_cox <- survfit(cox, newdata = patient_1)
surv_patient_2_cox <- survfit(cox, newdata = patient_2)
surv_patient_1_cox_nl <- survfit(cox_nl, newdata = patient_1_nl)
surv_patient_2_cox_nl <- survfit(cox_nl, newdata = patient_2_nl)
prob_18_months_patient_1 <- summary(surv_patient_1_cox, times = 18)$surv
prob_18_months_patient_2 <- summary(surv_patient_2_cox, times = 18)$surv
prob_18_months_patient_1_nl <- summary(surv_patient_1_cox_nl, times = 18)$surv
prob_18_months_patient_2_nl <- summary(surv_patient_2_cox_nl, times = 18)$surv
cat("The probability that these two patients will survive beyond 18 months with the Linear Model : \n")## The probability that these two patients will survive beyond 18 months with the Linear Model :
## First Patient : - 0.8760176
## Second Patient : - 0.5275944
cat("\nThe probability that these two patients will survive beyond 18 months with the Non linear Model : \n")##
## The probability that these two patients will survive beyond 18 months with the Non linear Model :
## First Patient : - 0.9812692
## Second Patient : - 0.3875033
patient_1_12_months <- data.frame(prev_infection = "noAIDS", azt = "failure", cd4 = 20)
patient_2_12_months <- data.frame(prev_infection = "AIDS", azt = "intolerance", cd4 = 2)
surv_patient_1_12_months <- survfit(cox_nl, newdata = patient_1_12_months)
surv_patient_2_12_months <- survfit(cox_nl, newdata = patient_2_12_months)
prob_patient_1_12_months <- summary(surv_patient_1_12_months, times = 12)$surv
prob_patient_2_12_months <- summary(surv_patient_2_12_months, times = 12)$surv
prob_patient_1_18_months <- summary(surv_patient_1_12_months, times = 18)$surv
prob_patient_2_18_months <- summary(surv_patient_2_12_months, times = 18)$surv
prob_cond_patient_1 <- prob_patient_1_18_months / prob_patient_1_12_months
prob_cond_patient_2 <- prob_patient_2_18_months / prob_patient_2_12_months
cat("First Patient - Conditional probability of survival between 12 and 18 months : ", prob_cond_patient_1, "\n")## First Patient - Conditional probability of survival between 12 and 18 months : 0.9911895
cat("Individu âgé - Probabilité conditionnelle de survie entre 12 et 18 mois : ", prob_cond_patient_2, "\n")## Individu âgé - Probabilité conditionnelle de survie entre 12 et 18 mois : 0.6416609
cox_roc <- survivalROC(
Stime = test_data$time,
status = test_data$death,
marker = cox_pred,
predict.time = 18,
method = "KM"
)
cox_auc <- cox_roc$AUC
rf_roc <- survivalROC(
Stime = test_data$time,
status = test_data$death,
marker = rf_pred_probs,
predict.time = 18,
method = "KM"
)
rf_auc <- rf_roc$AUC
rf_opti_roc <- survivalROC(
Stime = test_data$time,
status = test_data$death,
marker = rf_pred_opti_probs,
predict.time = 18,
method = "KM"
)
rf_opti_auc <- rf_opti_roc$AUC
roc_data_cox <- data.frame(
FPR = cox_roc$FP,
TPR = cox_roc$TP,
Model = "Cox",
AUC = round(cox_roc$AUC, 3)
)
roc_data_rf <- data.frame(
FPR = rf_roc$FP,
TPR = rf_roc$TP,
Model = "RF",
AUC = round(rf_roc$AUC, 3)
)
roc_data_rf_opti <- data.frame(
FPR = rf_opti_roc$FP,
TPR = rf_opti_roc$TP,
Model = "RF Optimisé",
AUC = round(rf_opti_roc$AUC, 3)
)
roc_data <- rbind(roc_data_cox, roc_data_rf, roc_data_rf_opti)
roc_curve = ggplot(roc_data, aes(x = FPR, y = TPR, color = Model)) +
geom_line(size = 1) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray") +
labs(
title = paste("ROC Curves (AUC)"),
x = "False Positive Rate (FPR)",
y = "True Positive Rate (TPR)",
subtitle = paste("Cox AUC =", cox_auc, "\nRF AUC =", rf_auc, "\nRF Optimized, AUC =", rf_opti_auc)
) +
theme_minimal() +
theme(legend.title = element_blank())## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
times <- seq(0, max(test_data$time), by = 5)
brier_score <- Score(
object = list("Cox" = cox),
formula = Surv(time, death) ~ 1,
data = test_data,
times = times,
metrics = "Brier"
)## num [1:5] 0 5 10 15 20
## Classes 'data.table' and 'data.frame': 10 obs. of 6 variables:
## $ model: Factor w/ 2 levels "Null model","Cox": 1 1 1 1 1 2 2 2 2 2
## $ times: num 0 5 10 15 20 0 5 10 15 20
## $ Brier: num 0 0.0474 0.1289 0.2024 0.2357 ...
## $ se : num 0 0.00956 0.01219 0.01001 0.00875 ...
## $ lower: num 0 0.0287 0.105 0.1828 0.2186 ...
## $ upper: num 0 0.0661 0.1528 0.222 0.2529 ...
## - attr(*, ".internal.selfref")=<externalptr>
embedded <- function(brier_results) {
times <- brier_results$times
brier_scores <- brier_results$Brier$score
intervals <- diff(times)
weights <- c(intervals, 0) / sum(intervals)
# Calculer le score intégré
embedded_score <- sum(brier_scores * weights, na.rm = TRUE)
return(embedded_score)
}
embedded_score <- embedded(brier_score)## Warning in Ops.factor(left, right): '*' not meaningful for factors
## Embedded Score: 15.57423